library(dplyr); library(reshape2); library(ggplot2)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Restrict a wide Zillow neighborhood table to New York City and reshape it
# to long format for plotting.
#
# Args:
#   dat:        Wide data frame with `State`, `City` and `RegionName` columns.
#               Every column from position `date_start` to the last column is
#               treated as one time period; its name is expected to be a
#               one-character prefix followed by "YYYY.MM" (presumably the
#               "X" that read.csv prepends -- verify against the CSV header).
#   date_start: Index of the first time-period column (default 7).
#
# Returns:
#   Long data frame with columns `RegionName`, `variable` (period rewritten
#   as a "YYYY.MM.01" string), `value`, and `date` (class Date).
filter_zillow_data <- function(dat, date_start=7){
  # `a:b` counts backwards when a > b, which would silently select the wrong
  # columns, so reject a start index outside the table.
  stopifnot(length(date_start) == 1,
            date_start >= 1,
            date_start <= ncol(dat))

  # Only new york
  dat <- filter(dat, State=="NY",  City=="New York")

  # Prepare for plotting: one row per (neighborhood, period)
  period_cols <- colnames(dat)[seq(date_start, ncol(dat))]
  dat_m <- melt(dat, id.vars="RegionName", measure.vars=period_cols)

  # Make more date-ey: drop the first character of the column name and paste
  # on a day to make it unambiguous. sprintf() is vectorized and returns a
  # zero-length result for zero-length input, unlike sapply()/paste0().
  period_label <- substring(as.character(dat_m$variable), 2)
  dat_m$variable <- sprintf("%s.01", period_label)
  dat_m$date <- as.Date(dat_m$variable, format="%Y.%m.%d")
  dat_m
}

The Zillow Data

Median Listing Prices by Neighborhood

# Read the Zillow median listing price table (all homes).
Neighborhood_MedianListingPrice_AllHomes <- read.csv("~/Documents/github/CityPredictions/data/Zillow/Neighborhood_MedianListingPrice_AllHomes.csv")

# Keep New York City rows and reshape to long format (period columns start at
# the function's default position).
dat_med_listing <- filter_zillow_data(Neighborhood_MedianListingPrice_AllHomes)

# One line per neighborhood. The legend is suppressed with "none"; passing
# FALSE to guides() is deprecated. coord_cartesian() zooms the y axis without
# dropping points outside the range.
ggplot(data=dat_med_listing,
       aes(x=date, y=value, group=RegionName, col=RegionName)) +
  geom_line(alpha=0.5) +
  guides(colour="none") +
  coord_cartesian(ylim=c(200000,2000000)) +
  ggtitle("Median Listing Prices in New York Neighborhoods") +
  labs(x="Year", y="Median Listing Price (All Homes)")

Median Sold Prices by Neighborhood

# Read the Zillow median sold price table (all homes).
Neighborhood_MedianSoldPrice_AllHomes <- read.csv("~/Documents/github/CityPredictions/data/Zillow/Neighborhood_MedianSoldPrice_AllHomes.csv")

# Keep New York City rows and reshape to long format. This call treats the
# period columns as starting at position 8 rather than the default 7.
dat_med_sold <- filter_zillow_data(Neighborhood_MedianSoldPrice_AllHomes, date_start=8)

# One line per neighborhood. The legend is suppressed with "none"; passing
# FALSE to guides() is deprecated.
ggplot(data=dat_med_sold,
       aes(x=date, y=value, group=RegionName, col=RegionName)) +
  geom_line(alpha=0.5) +
  guides(colour="none") +
  coord_cartesian(ylim=c(200000,2000000)) +
  ggtitle("Median selling Prices in New York Neighborhoods") +
  labs(x="Year", y="Median Sold Price (All Homes)")

So I wasn’t thinking about this, but perhaps I could do predictions of the difference between listing and selling price by area?

These two data sets are essentially identical in structure - it would be interesting to see whether there is a large difference between them

# Median across all neighborhoods for each date, ignoring missing values.
median_sell <- summarize(group_by(dat_med_sold, date),
                         Med_Sell=median(value, na.rm=TRUE))

median_list <- summarize(group_by(dat_med_listing, date),
                         Med_Listing=median(value, na.rm=TRUE))

# Keep only dates present in both series, add the gap between them (sold
# minus listing), then reshape to long format with one row per (date, series).
medians <- merge(median_sell, median_list, by="date") %>%
  mutate(Difference=Med_Sell - Med_Listing) %>%
  melt(id.vars="date")

# Plot the two price series with a smoother each; the Difference series is
# left out of this plot.
ggplot(data=filter(medians, variable != "Difference"),
       aes(x=date, y=value, group=variable, col=variable)) +
  geom_line() +
  geom_smooth() +
  labs(x="Year", y="Price ($)") +
  ggtitle("Difference in Median Listing and Sale Price for All NY Areas")

What about each area separately?

Need to interpolate/predict missing values!

# Pair each neighborhood's listing and sold price on the same date. merge()
# keeps only (RegionName, date) combinations present in both tables and
# suffixes the clashing `value` columns with .x (listing) and .y (sold).
medians_all <- merge(dat_med_listing, dat_med_sold, by=c("RegionName","date"))

# Keep the keys and the two prices, renaming the prices in the same step.
medians_all <- select(medians_all, RegionName, date,
                      ListPrice=value.x, SoldPrice=value.y)
medians_all$Difference <- medians_all$ListPrice - medians_all$SoldPrice

# One line per neighborhood. The legend is suppressed with "none"; passing
# FALSE to guides() is deprecated.
ggplot(data=medians_all, aes(x=date, y=Difference, group=RegionName, col=RegionName)) +
  geom_line(alpha=0.6) +
  coord_cartesian(ylim=c(-500000,1000000)) +
  guides(colour="none") +
  labs(x="Year", y="List Price - Sold Price") +
  ggtitle("Difference in Median Listing and Sale Price By Areas")

Median Rental Price for 1BR

# Load the 1-bedroom median rental table and reshape the New York City rows
# to long format.
Neighborhood_MedianRentalPrice_1Bedroom <- read.csv("~/Documents/github/CityPredictions/data/Zillow/Neighborhood_MedianRentalPrice_1Bedroom.csv")

dat_med_rental_1br <- filter_zillow_data(Neighborhood_MedianRentalPrice_1Bedroom)

# One neighborhood has a very jagged series. Locate it by computing the
# standard deviation of each neighborhood's prices and printing the row with
# the largest one.
data.frame(
  arrange(
    summarize(group_by(dat_med_rental_1br, RegionName),
              SD=sd(value, na.rm=TRUE)),
    desc(SD)
  )
)[1,]
##    RegionName       SD
## 1 Murray Hill 871.2724
# Drop the neighborhood with the largest standard deviation found above.
dat_med_rental_1br <- filter(dat_med_rental_1br, RegionName != "Murray Hill")

# One line per neighborhood. The legend is suppressed with "none"; passing
# FALSE to guides() is deprecated.
ggplot(data=dat_med_rental_1br, aes(x=date, y=value, group=RegionName, col=RegionName)) +
  geom_line(alpha=0.5) +
  guides(colour="none") +
  ggtitle("Median Rental Prices in New York Neighborhoods (1BR)") +
  labs(x="Year", y="Median Rental Price (1 BR Homes)")

# Let's look at the change in rental price from one date to the next.
# The lag has to be taken within each neighborhood and in date order: the
# melted table stacks all neighborhoods for one date before moving to the
# next date, so an ungrouped lag() would subtract a different neighborhood's
# price instead of the same neighborhood's previous price.
# dplyr::lag is qualified so stats::lag can never be picked up by mistake.
dat_med_rental_1br <- dat_med_rental_1br %>%
  group_by(RegionName) %>%
  mutate(lagged_val=dplyr::lag(value, order_by=date)) %>%
  ungroup() %>%
  as.data.frame()

# First difference; the first date of every neighborhood is NA.
dat_med_rental_1br$detrend_val <- dat_med_rental_1br$value - dat_med_rental_1br$lagged_val

# One line per neighborhood. The legend is suppressed with "none"; passing
# FALSE to guides() is deprecated.
ggplot(data=dat_med_rental_1br, aes(x=date, y=detrend_val, group=RegionName, col=RegionName)) +
  geom_line(alpha=0.5) +
  guides(colour="none") +
  ggtitle("De-trended Median Rental Prices in New York Neighborhoods (1BR)") +
  labs(x="Year", y="De-trended Median Rental Price (1 BR Homes)")

Median Rental Price for 2BR

# Load the 2-bedroom median rental table and reshape the New York City rows
# to long format.
Neighborhood_MedianRentalPrice_2Bedroom <- read.csv("~/Documents/github/CityPredictions/data/Zillow/Neighborhood_MedianRentalPrice_2Bedroom.csv")

dat_med_rental_2br <- filter_zillow_data(Neighborhood_MedianRentalPrice_2Bedroom, date_start=7)

# NOTE(review): Murray Hill was identified as an outlier from the 1BR data
# only; confirm it should also be dropped from this table.
dat_med_rental_2br <- filter(dat_med_rental_2br, RegionName != "Murray Hill")

# One line per neighborhood. The legend is suppressed with "none"; passing
# FALSE to guides() is deprecated.
ggplot(data=dat_med_rental_2br, aes(x=date, y=value, group=RegionName, col=RegionName)) +
  geom_line(alpha=0.5) +
  guides(colour="none") +
  ggtitle("Median Rental Prices in New York Neighborhoods (2BR)") +
  labs(x="Year", y="Median Rental Price (2 BR Homes)")

Median Rental Price for 3BR

# Load the 3-bedroom median rental table and reshape the New York City rows
# to long format.
Neighborhood_MedianRentalPrice_3Bedroom <- read.csv("~/Documents/github/CityPredictions/data/Zillow/Neighborhood_MedianRentalPrice_3Bedroom.csv")

dat_med_rental_3br <- filter_zillow_data(Neighborhood_MedianRentalPrice_3Bedroom, date_start=7)

# NOTE(review): Murray Hill was identified as an outlier from the 1BR data
# only; confirm it should also be dropped from this table.
dat_med_rental_3br <- filter(dat_med_rental_3br, RegionName != "Murray Hill")

# One line per neighborhood. The legend is suppressed with "none"; passing
# FALSE to guides() is deprecated.
ggplot(data=dat_med_rental_3br, aes(x=date, y=value, group=RegionName, col=RegionName)) +
  geom_line(alpha=0.5) +
  guides(colour="none") +
  ggtitle("Median Rental Prices in New York Neighborhoods (3BR)") +
  labs(x="Year", y="Median Rental Price (3 BR Homes)")

Median Rental Price for 4BR

# Load the 4-bedroom median rental table, then reshape the New York City rows
# to long format (period columns start at position 7).
Neighborhood_MedianRentalPrice_4Bedroom <- read.csv(
  "~/Documents/github/CityPredictions/data/Zillow/Neighborhood_MedianRentalPrice_4Bedroom.csv"
)

dat_med_rental_4br <- filter_zillow_data(
  Neighborhood_MedianRentalPrice_4Bedroom,
  date_start=7
)

# Exclude the Murray Hill neighborhood from this table.
dat_med_rental_4br <- dat_med_rental_4br %>%
  filter(RegionName != "Murray Hill")

# One line per neighborhood. The legend stays visible in this plot: the
# guides() call that hides it in the other plots is commented out here.
ggplot(
  data=dat_med_rental_4br,
  aes(x=date, y=value, group=RegionName, col=RegionName)
) +
  geom_line(alpha=0.5) +
  #guides(col=FALSE) +
  labs(
    title="Median Rental Prices in New York Neighborhoods (4BR)",
    x="Year",
    y="Median Rental Price (4 BR Homes)"
  )

Median Rental Price for All Homes

# Load the all-homes median rental table and reshape the New York City rows
# to long format.
Neighborhood_MedianRentalPrice_AllHomes <- read.csv("~/Documents/github/CityPredictions/data/Zillow/Neighborhood_MedianRentalPrice_AllHomes.csv")

dat_med_rental_AllHomes <- filter_zillow_data(Neighborhood_MedianRentalPrice_AllHomes, date_start=7)

# NOTE(review): Murray Hill was identified as an outlier from the 1BR data
# only; confirm it should also be dropped from this table.
dat_med_rental_AllHomes <- filter(dat_med_rental_AllHomes, RegionName != "Murray Hill")

# One line per neighborhood. The legend is suppressed with "none"; passing
# FALSE to guides() is deprecated.
ggplot(data=dat_med_rental_AllHomes, aes(x=date, y=value, group=RegionName, col=RegionName)) +
  geom_line(alpha=0.5) +
  guides(colour="none") +
  ggtitle("Median Rental Prices in New York Neighborhoods (All)") +
  labs(x="Year", y="Median Rental Price (All Homes)")